Insurance Prediction. Data analysis and modeling.

1. Basic Data Analysis

In [1]:
!pip install pyod
Collecting pyod
  Downloading pyod-0.8.4.tar.gz (98 kB)
Collecting combo
  Downloading combo-0.1.1.tar.gz (37 kB)
Collecting suod
  Downloading suod-0.0.4.tar.gz (2.1 MB)
  (requirement checks and wheel builds omitted)
Successfully built pyod combo suod
Installing collected packages: combo, suod, pyod
Successfully installed combo-0.1.1 pyod-0.8.4 suod-0.0.4

In [2]:
import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold, train_test_split

import optuna
from optuna.samplers import TPESampler

from pyod.models.copod import COPOD

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.utils import to_categorical

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
In [3]:
train = pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/train.csv')
test = pd.read_csv('/kaggle/input/health-insurance-cross-sell-prediction/test.csv')
In [4]:
train
Out[4]:
id Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 Male 44 1 28.0 0 > 2 Years Yes 40454.0 26.0 217 1
1 2 Male 76 1 3.0 0 1-2 Year No 33536.0 26.0 183 0
2 3 Male 47 1 28.0 0 > 2 Years Yes 38294.0 26.0 27 1
3 4 Male 21 1 11.0 1 < 1 Year No 28619.0 152.0 203 0
4 5 Female 29 1 41.0 1 < 1 Year No 27496.0 152.0 39 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 381105 Male 74 1 26.0 1 1-2 Year No 30170.0 26.0 88 0
381105 381106 Male 30 1 37.0 1 < 1 Year No 40016.0 152.0 131 0
381106 381107 Male 21 1 30.0 1 < 1 Year No 35118.0 160.0 161 0
381107 381108 Female 68 1 14.0 0 > 2 Years Yes 44617.0 124.0 74 0
381108 381109 Male 46 1 29.0 0 1-2 Year No 41777.0 26.0 237 0

381109 rows × 12 columns

In [5]:
train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)
In [6]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Bar(
        x=['Male', 'Female'], 
        y=[
            len(train[train['Gender']=='Male']),
            len(train[train['Gender']=='Female'])
        ], 
        name='Train Gender',
        text = [
            str(round(100 * len(train[train['Gender']=='Male']) / len(train), 2)) + '%',
            str(round(100 * len(train[train['Gender']=='Female']) / len(train), 2)) + '%'
        ],
        textposition='auto'
    ),
    go.Bar(
        x=['Male', 'Female'], 
        y=[
            len(test[test['Gender']=='Male']),
            len(test[test['Gender']=='Female'])
        ], 
        name='Test Gender',
        text=[
            str(round(100 * len(test[test['Gender']=='Male']) / len(test), 2)) + '%',
            str(round(100 * len(test[test['Gender']=='Female']) / len(test), 2)) + '%'
        ],
        textposition='auto'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2) + 1
    )

fig.update_layout(
    title_text='Train/test gender column',
    height=400,
    width=700
)

fig.show()
In [7]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Bar(
        x=['Yes', 'No'], 
        y=[
            len(train[train['Driving_License']==1]),
            len(train[train['Driving_License']==0])
        ], 
        name='Train Driving_License',
        text = [
            str(round(100 * len(train[train['Driving_License']==1]) / len(train), 2)) + '%',
            str(round(100 * len(train[train['Driving_License']==0]) / len(train), 2)) + '%'
        ],
        textposition='auto'
    ),
    go.Bar(
        x=['Yes', 'No'], 
        y=[
            len(test[test['Driving_License']==1]),
            len(test[test['Driving_License']==0])
        ], 
        name='Test Driving_License',
        text=[
            str(round(100 * len(test[test['Driving_License']==1]) / len(test), 2)) + '%',
            str(round(100 * len(test[test['Driving_License']==0]) / len(test), 2)) + '%'
        ],
        textposition='auto'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train/test Driving_License column',
    height=400,
    width=700
)

fig.show()
In [8]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Bar(
        x=['Yes', 'No'], 
        y=[
            len(train[train['Previously_Insured']==1]),
            len(train[train['Previously_Insured']==0])
        ], 
        name='Train Previously_Insured',
        text = [
            str(round(100 * len(train[train['Previously_Insured']==1]) / len(train), 2)) + '%',
            str(round(100 * len(train[train['Previously_Insured']==0]) / len(train), 2)) + '%'
        ],
        textposition='auto'
    ),
    go.Bar(
        x=['Yes', 'No'], 
        y=[
            len(test[test['Previously_Insured']==1]),
            len(test[test['Previously_Insured']==0])
        ], 
        name='Test Previously_Insured',
        text = [
            str(round(100 * len(test[test['Previously_Insured']==1]) / len(test), 2)) + '%',
            str(round(100 * len(test[test['Previously_Insured']==0]) / len(test), 2)) + '%'
        ],
        textposition='auto'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train/test Previously_Insured column',
    height=400,
    width=700
)

fig.show()
In [9]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Bar(
        x=['Yes', 'No'], 
        y=[
            len(train[train['Vehicle_Damage']=='Yes']),
            len(train[train['Vehicle_Damage']=='No'])
        ], 
        name='Train Vehicle_Damage',
        text = [
            str(round(100 * len(train[train['Vehicle_Damage']=='Yes']) / len(train), 2)) + '%',
            str(round(100 * len(train[train['Vehicle_Damage']=='No']) / len(train), 2)) + '%'
        ],
        textposition='auto'
    ),
    go.Bar(
        x=['Yes', 'No'], 
        y=[
            len(test[test['Vehicle_Damage']=='Yes']),
            len(test[test['Vehicle_Damage']=='No'])
        ], 
        name='Test Vehicle_Damage',
        text = [
            str(round(100 * len(test[test['Vehicle_Damage']=='Yes']) / len(test), 2)) + '%',
            str(round(100 * len(test[test['Vehicle_Damage']=='No']) / len(test), 2)) + '%'
        ],
        textposition='auto'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train/test Vehicle_Damage column',
    height=400,
    width=700
)

fig.show()
In [10]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Bar(
        x=['> 2 Years', '1-2 Year', '< 1 Year'], 
        y=[
            len(train[train['Vehicle_Age']=='> 2 Years']),
            len(train[train['Vehicle_Age']=='1-2 Year']),
            len(train[train['Vehicle_Age']=='< 1 Year'])
        ], 
        name='Train Vehicle_Age',
        text = [
            str(round(100 * len(train[train['Vehicle_Age']=='> 2 Years']) / len(train), 2)) + '%',
            str(round(100 * len(train[train['Vehicle_Age']=='1-2 Year']) / len(train), 2)) + '%',
            str(round(100 * len(train[train['Vehicle_Age']=='< 1 Year']) / len(train), 2)) + '%'
        ],
        textposition='auto'
    ),
    go.Bar(
        x=['> 2 Years', '1-2 Year', '< 1 Year'], 
        y=[
            len(test[test['Vehicle_Age']=='> 2 Years']),
            len(test[test['Vehicle_Age']=='1-2 Year']),
            len(test[test['Vehicle_Age']=='< 1 Year'])
        ], 
        name='Test Vehicle_Age',
        text = [
            str(round(100 * len(test[test['Vehicle_Age']=='> 2 Years']) / len(test), 2)) + '%',
            str(round(100 * len(test[test['Vehicle_Age']=='1-2 Year']) / len(test), 2)) + '%',
            str(round(100 * len(test[test['Vehicle_Age']=='< 1 Year']) / len(test), 2)) + '%'
        ],
        textposition='auto'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train/test Vehicle_Age column',
    height=400,
    width=700
)

fig.show()
In [11]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Histogram(
        x=train['Age'], 
        name='Train Age'
    ),
    go.Histogram(
        x=test['Age'], 
        name='Test Age'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train/test Age column distribution',
    height=500,
    width=900
)

fig.show()
In [12]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Histogram(
        x=train['Annual_Premium'], 
        name='Train Annual_Premium'
    ),
    go.Histogram(
        x=test['Annual_Premium'], 
        name='Test Annual_Premium'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train/test Annual_Premium column distribution',
    height=500,
    width=800
)

fig.show()
In [13]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Histogram(
        x=train['Policy_Sales_Channel'], 
        name='Train Policy_Sales_Channel'
    ),
    go.Histogram(
        x=test['Policy_Sales_Channel'], 
        name='Test Policy_Sales_Channel'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2) + 1
    )

fig.update_layout(
    title_text='Train/test Policy_Sales_Channel column distribution',
    height=500,
    width=800
)

fig.show()
In [14]:
fig = make_subplots(rows=1, cols=2)

traces = [
    go.Histogram(
        x=train['Vintage'], 
        name='Train Vintage'
    ),
    go.Histogram(
        x=test['Vintage'], 
        name='Test Vintage'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train/test Vintage column distribution',
    height=500,
    width=800
)

fig.show()
In [15]:
tr = train['Region_Code'].value_counts().reset_index()
x_tr = tr['index'].tolist()
y_tr = tr['Region_Code'].tolist()
te = test['Region_Code'].value_counts().reset_index()
x_te = te['index'].tolist()
y_te = te['Region_Code'].tolist()

fig = make_subplots(rows=2, cols=1)

traces = [
    go.Bar(
        x=x_tr, 
        y=y_tr, 
        name='Train Region_Code'
    ),
    go.Bar(
        x=x_te, 
        y=y_te, 
        name='Test Region_Code'
    )
]

for i in range(len(traces)):
    fig.append_trace(
        traces[i],
        i + 1,  # rows=2, cols=1: one trace per row
        1
    )

fig.update_layout(
    title_text='Train / test Region_Code',
    height=900,
    width=800
)

fig.show()
In [16]:
fig = make_subplots(rows=1, cols=1)

traces = [
    go.Bar(
        x=['Yes', 'No'], 
        y=[
            len(train[train['Response']==1]),
            len(train[train['Response']==0])
        ], 
        name='Train Response'
    ),
]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train Response column',
    height=400,
    width=400
)

fig.show()
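The categorical bar-chart cells above (In [6]-In [10] and In [16]) all repeat the same subplot pattern. A small helper could replace the copy-paste; a sketch (the function name and signature are ours, not part of the original notebook):

def plot_train_test_counts(column, labels, values):
    # Side-by-side train/test bar charts of category counts for one column,
    # annotated with each category's share of its dataset.
    fig = make_subplots(rows=1, cols=2)
    for col_idx, (df, name) in enumerate([(train, 'Train'), (test, 'Test')], start=1):
        counts = [len(df[df[column] == v]) for v in values]
        fig.add_trace(
            go.Bar(
                x=labels,
                y=counts,
                name=f'{name} {column}',
                text=[f'{100 * c / len(df):.2f}%' for c in counts],
                textposition='auto'
            ),
            row=1, col=col_idx
        )
    fig.update_layout(title_text=f'Train/test {column} column', height=400, width=700)
    fig.show()

# Reproduces In [6]:
# plot_train_test_counts('Gender', ['Male', 'Female'], ['Male', 'Female'])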

As we can see from this initial analysis, every column has essentially the same distribution in the train and test sets. Let's move on to feature engineering and modeling.

In [17]:
fig = px.histogram(
    train, 
    "Age", 
    color='Response',
    nbins=100, 
    title='Age & Response distribution', 
    width=700,
    height=500
)

fig.show()
In [18]:
fig = px.histogram(
    train[train['Response'] == 1], 
    "Age", 
    nbins=100, 
    title='Age distribution for positive response', 
    width=700,
    height=500
)

fig.show()
In [19]:
fig = make_subplots(
    rows=1, 
    cols=2
)

traces = [
    go.Bar(
        x=['Declined', 'Accepted'], 
        y=[
            len(train[(train['Gender']=='Male') & (train['Response']==0)]),
            len(train[(train['Gender']=='Male') & (train['Response']==1)])
        ], 
        name='Gender: Male'
    ),
    go.Bar(
        x=['Declined', 'Accepted'],  
        y=[
            len(train[(train['Gender']=='Female') & (train['Response']==0)]),
            len(train[(train['Gender']=='Female') & (train['Response']==1)])
        ], 
        name='Gender: Female'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train gender/response dependencies',
    height=400,
    width=700
)

fig.show()
In [20]:
fig = make_subplots(
    rows=1, 
    cols=2
)

traces = [
    go.Bar(
        x=['Declined', 'Accepted'], 
        y=[
            len(train[(train['Previously_Insured']==0) & (train['Response']==0)]),
            len(train[(train['Previously_Insured']==0) & (train['Response']==1)])
        ], 
        name='Previously_Insured: Previously Not Insured'
    ),
    go.Bar(
        x=['Declined', 'Accepted'],  
        y=[
            len(train[(train['Previously_Insured']==1) & (train['Response']==0)]),
            len(train[(train['Previously_Insured']==1) & (train['Response']==1)])
        ], 
        name='Previously_Insured: Previously Insured'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train previously_insured/response dependencies',
    height=400,
    width=700
)

fig.show()
In [21]:
fig = make_subplots(
    rows=1, 
    cols=2
)

traces = [
    go.Bar(
        x=['Declined', 'Accepted'], 
        y=[
            len(train[(train['Vehicle_Damage']=='No') & (train['Response']==0)]),
            len(train[(train['Vehicle_Damage']=='No') & (train['Response']==1)])
        ], 
        name='Vehicle_Damage: No'
    ),
    go.Bar(
        x=['Declined', 'Accepted'],  
        y=[
            len(train[(train['Vehicle_Damage']=='Yes') & (train['Response']==0)]),
            len(train[(train['Vehicle_Damage']=='Yes') & (train['Response']==1)])
        ], 
        name='Vehicle_Damage: Yes'
    ),

]

for i in range(len(traces)):
    fig.append_trace(
        traces[i], 
        (i // 2) + 1, 
        (i % 2)  + 1
    )

fig.update_layout(
    title_text='Train vehicle_damage/response dependencies',
    height=400,
    width=700
)

fig.show()
In [22]:
fig = make_subplots(
    rows=1, 
    cols=3
)

traces = [
    go.Bar(
        x=['Declined', 'Accepted'], 
        y=[
            len(train[(train['Vehicle_Age']=='> 2 Years') & (train['Response']==0)]),
            len(train[(train['Vehicle_Age']=='> 2 Years') & (train['Response']==1)])
        ], 
        name='Vehicle_Age: > 2 Years'
    ),
    go.Bar(
        x=['Declined', 'Accepted'], 
        y=[
            len(train[(train['Vehicle_Age']=='1-2 Year') & (train['Response']==0)]),
            len(train[(train['Vehicle_Age']=='1-2 Year') & (train['Response']==1)])
        ], 
        name='Vehicle_Age: 1-2 Year'
    ),
    go.Bar(
        x=['Declined', 'Accepted'], 
        y=[
            len(train[(train['Vehicle_Age']=='< 1 Year') & (train['Response']==0)]),
            len(train[(train['Vehicle_Age']=='< 1 Year') & (train['Response']==1)])
        ], 
        name='Vehicle_Age: < 1 Year'
    ),

]

for i in range(len(traces)):
    fig.append_trace(traces[i], (i // 3) + 1, (i % 3) + 1)

fig.update_layout(
    title_text='Train/test Vehicle_Age/Response dependencies',
    height=400,
    width=800
)

fig.show()
In [23]:
fig = px.histogram(
    train, 
    "Annual_Premium", 
    color='Response',
    nbins=100, 
    title='Annual_Premium & Response distribution', 
    width=700,
    height=500
)
fig.show()
In [24]:
fig = px.histogram(
    train[train['Response'] == 1], 
    "Annual_Premium", 
    nbins=100, 
    title='Annual_Premium distribution for positive response', 
    width=700,
    height=500
)

fig.show()
In [25]:
fig = px.histogram(
    train, 
    "Vintage", 
    color='Response',
    nbins=100, 
    title='Vintage & Response distribution', 
    width=700,
    height=500
)

fig.show()
In [26]:
fig = px.histogram(
    train[train['Response'] == 1], 
    "Vintage", 
    nbins=100, 
    title='Vintage distribution for positive response', 
    width=700,
    height=500
)
fig.show()

2. Feature Engineering

1) Convert columns with text values

In [27]:
train.loc[train['Gender'] == 'Male', 'Gender'] = 1
train.loc[train['Gender'] == 'Female', 'Gender'] = 0
test.loc[test['Gender'] == 'Male', 'Gender'] = 1
test.loc[test['Gender'] == 'Female', 'Gender'] = 0

train.loc[train['Vehicle_Age'] == '> 2 Years', 'Vehicle_Age'] = 2
train.loc[train['Vehicle_Age'] == '1-2 Year', 'Vehicle_Age'] = 1
train.loc[train['Vehicle_Age'] == '< 1 Year', 'Vehicle_Age'] = 0
test.loc[test['Vehicle_Age'] == '> 2 Years', 'Vehicle_Age'] = 2
test.loc[test['Vehicle_Age'] == '1-2 Year', 'Vehicle_Age'] = 1
test.loc[test['Vehicle_Age'] == '< 1 Year', 'Vehicle_Age'] = 0

train.loc[train['Vehicle_Damage'] == 'Yes', 'Vehicle_Damage'] = 1
train.loc[train['Vehicle_Damage'] == 'No', 'Vehicle_Damage'] = 0
test.loc[test['Vehicle_Damage'] == 'Yes', 'Vehicle_Damage'] = 1
test.loc[test['Vehicle_Damage'] == 'No', 'Vehicle_Damage'] = 0
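The block above works, but the same conversion reads more compactly with `map`; an equivalent alternative (not a second step to run on top of the cell above):

gender_map = {'Male': 1, 'Female': 0}
vehicle_age_map = {'> 2 Years': 2, '1-2 Year': 1, '< 1 Year': 0}
vehicle_damage_map = {'Yes': 1, 'No': 0}

for df in (train, test):
    df['Gender'] = df['Gender'].map(gender_map)
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_map)
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage_map)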
In [28]:
for col in train.columns:
    train[col] = train[col].astype(np.int32)

train
Out[28]:
Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response
0 1 44 1 28 0 2 1 40454 26 217 1
1 1 76 1 3 0 1 0 33536 26 183 0
2 1 47 1 28 0 2 1 38294 26 27 1
3 1 21 1 11 1 0 0 28619 152 203 0
4 0 29 1 41 1 0 0 27496 152 39 0
... ... ... ... ... ... ... ... ... ... ... ...
381104 1 74 1 26 1 1 0 30170 26 88 0
381105 1 30 1 37 1 0 0 40016 152 131 0
381106 1 21 1 30 1 0 0 35118 160 161 0
381107 0 68 1 14 0 2 1 44617 124 74 0
381108 1 46 1 29 0 1 0 41777 26 237 0

381109 rows × 11 columns

In [29]:
f = plt.figure(
    figsize=(13, 11)
)

plt.matshow(
    train.corr(), 
    fignum=f.number
)

plt.xticks(
    range(train.shape[1]), 
    train.columns, 
    fontsize=14, 
    rotation=75
)

plt.yticks(
    range(train.shape[1]), 
    train.columns, 
    fontsize=14
)

cb = plt.colorbar()

cb.ax.tick_params(
    labelsize=14
)

Correlation of every feature with the target:

In [30]:
for col in train.columns:
    if col == 'Response':
        continue
    print(col, train[col].corr(train['Response']))
Gender 0.052439913771342224
Age 0.11114689471251052
Driving_License 0.010155174594073956
Region_Code 0.010569855615223145
Previously_Insured -0.34117046261352474
Vehicle_Age 0.2218739872179901
Vehicle_Damage 0.3543995438797554
Annual_Premium 0.022574695542560096
Policy_Sales_Channel -0.13904150082916147
Vintage -0.001050372001989892
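The same numbers come out of the correlation matrix in one line:

print(train.corr()['Response'].drop('Response').sort_values())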
In [31]:
fig = px.scatter(
    train, 
    x="Annual_Premium", 
    y="Age", 
    color="Response",
    width=600,
    height=600,
    title='Annual_premium vs Age scatter'
)

fig.show()

3. Modeling

Let's try unsupervised learning first. We will use the k-means clustering algorithm and check its scores.

In [32]:
X = train.drop(['Response'], axis=1)
y = train['Response']
In [33]:
kmeans = KMeans(
    n_clusters=2, 
    random_state=666
).fit(X)
In [34]:
train['cluster'] = kmeans.labels_
train
Out[34]:
Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage Response cluster
0 1 44 1 28 0 2 1 40454 26 217 1 0
1 1 76 1 3 0 1 0 33536 26 183 0 0
2 1 47 1 28 0 2 1 38294 26 27 1 0
3 1 21 1 11 1 0 0 28619 152 203 0 0
4 0 29 1 41 1 0 0 27496 152 39 0 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 1 74 1 26 1 1 0 30170 26 88 0 0
381105 1 30 1 37 1 0 0 40016 152 131 0 0
381106 1 21 1 30 1 0 0 35118 160 161 0 0
381107 0 68 1 14 0 2 1 44617 124 74 0 0
381108 1 46 1 29 0 1 0 41777 26 237 0 0

381109 rows × 12 columns

In [35]:
train['cluster'].value_counts()
Out[35]:
0    309145
1     71964
Name: cluster, dtype: int64
In [36]:
print('Kmeans accuracy: ', accuracy_score(train['Response'], train['cluster']))
print('Kmeans f1_score: ', f1_score(train['Response'], train['cluster']))
Kmeans accuracy:  0.7362591804444398
Kmeans f1_score:  0.15302425131031228
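One caveat: k-means cluster IDs are arbitrary, so cluster 1 is not guaranteed to correspond to a positive response. A fairer comparison also scores the flipped assignment and keeps the better-aligned one; a small sketch:

flipped = 1 - train['cluster']
if f1_score(train['Response'], flipped) > f1_score(train['Response'], train['cluster']):
    train['cluster'] = flipped  # relabel so cluster 1 lines up with positive responses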

Now let's try the COPOD anomaly-detection model and check the results.

In [37]:
response = train['Response']
train = train.drop(['Response', 'cluster'], axis=1)
In [38]:
clf = COPOD(
    contamination=0.15
)
clf.fit(train)
Out[38]:
COPOD(contamination=0.15)
In [39]:
cluster = clf.predict(train)
train['cluster'] = cluster
train['Response'] = response
train
Out[39]:
Gender Age Driving_License Region_Code Previously_Insured Vehicle_Age Vehicle_Damage Annual_Premium Policy_Sales_Channel Vintage cluster Response
0 1 44 1 28 0 2 1 40454 26 217 0 1
1 1 76 1 3 0 1 0 33536 26 183 1 0
2 1 47 1 28 0 2 1 38294 26 27 0 1
3 1 21 1 11 1 0 0 28619 152 203 0 0
4 0 29 1 41 1 0 0 27496 152 39 0 0
... ... ... ... ... ... ... ... ... ... ... ... ...
381104 1 74 1 26 1 1 0 30170 26 88 0 0
381105 1 30 1 37 1 0 0 40016 152 131 0 0
381106 1 21 1 30 1 0 0 35118 160 161 0 0
381107 0 68 1 14 0 2 1 44617 124 74 1 0
381108 1 46 1 29 0 1 0 41777 26 237 0 0

381109 rows × 12 columns

In [40]:
train['cluster'].value_counts()
Out[40]:
0    323942
1     57167
Name: cluster, dtype: int64
In [41]:
print('COPOD accuracy: ', accuracy_score(train['Response'], train['cluster']))
print('COPOD f1_score: ', f1_score(train['Response'], train['cluster']))
COPOD accuracy:  0.7634246370460944
COPOD f1_score:  0.1320407789982383
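The `contamination=0.15` above is a guess at the outlier fraction. Since roughly 12.3% of training rows are positive, one alternative is to match the observed rate (a sketch, still to be validated):

clf = COPOD(contamination=float(response.mean()))  # ~0.123, the positive-response rate
clf.fit(train.drop(['cluster', 'Response'], axis=1))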

Let's build the first version of our classifier, using logistic regression.

Now we will create a validation set.

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
In [43]:
print('Positive cases % in validation set: ', round(100 * len(y_test[y_test == 1]) / len(y_test), 3), '%')
print('Positive cases % in train set: ', round(100 * len(y_train[y_train == 1]) / len(y_train), 3), '%')
Positive cases % in validation set:  12.343 %
Positive cases % in train set:  12.235 %

Both splits have nearly the same share of positive cases, so the validation set is representative of the training data and safe to use for evaluation.
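The balance here is a consequence of random splitting on a large sample; passing `stratify=y` to `train_test_split` would make the class proportions match exactly:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666, stratify=y  # exact class balance in both splits
)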

In [44]:
model = LogisticRegression(random_state=666)
model.fit(X_train, y_train)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:764: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Out[44]:
LogisticRegression(random_state=666)
In [45]:
preds = model.predict(X_test)
print('Simple Logistic Regression accuracy: ', accuracy_score(y_test, preds))
print('Simple Logistic Regression f1_score: ', f1_score(y_test, preds))
Simple Logistic Regression accuracy:  0.8738815565059956
Simple Logistic Regression f1_score:  0.06913914980149123
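The convergence warning during fitting comes from unscaled features: Annual_Premium is orders of magnitude larger than the binary columns. Wrapping the model in a scaling pipeline is the standard fix, as the warning itself suggests; a sketch:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(
    StandardScaler(),                     # zero-mean, unit-variance features
    LogisticRegression(random_state=666)  # typically converges without raising max_iter
)
model.fit(X_train, y_train)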
In [46]:
def plot_confusion_matrix(y_real, y_pred):
    cm = confusion_matrix(y_real, y_pred)

    ax= plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, fmt='g')

    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
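For example, applied to the logistic-regression predictions above:

plot_confusion_matrix(y_test, preds)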
In [48]:
X_train = X_train.drop(['Region_Code', 'Vintage', 'Driving_License'], axis=1)
X_test = X_test.drop(['Region_Code', 'Vintage', 'Driving_License'], axis=1)
In [49]:
model = LogisticRegression(random_state=666)
model.fit(X_train, y_train)
Out[49]:
LogisticRegression(random_state=666)
In [50]:
preds = model.predict(X_test)
print('Simple Logistic Regression accuracy: ', accuracy_score(y_test, preds))
print('Simple Logistic Regression f1_score: ', f1_score(y_test, preds))
Simple Logistic Regression accuracy:  0.8708640549972448
Simple Logistic Regression f1_score:  0.10526315789473685

After removing some columns, the predictions improve, but they are still not good.

Let's build a LightGBM model with default parameters.

In [52]:
model = LGBMClassifier(random_state=666)
model.fit(X_train, y_train)

preds = model.predict(X_test)
print('Simple LGBM accuracy: ', accuracy_score(y_test, preds))
print('Simple LGBM f1_score: ', f1_score(y_test, preds))
Simple LGBM accuracy:  0.8762168402823332
Simple LGBM f1_score:  0.00506168933881683
In [53]:
np.random.seed(666)
sampler = TPESampler(seed=0)

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 20)
    n_estimators = trial.suggest_int("n_estimators", 1, 400)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0000001, 1)
    gamma = trial.suggest_uniform('gamma', 0.0000001, 1)
    scale_pos_weight = trial.suggest_int("scale_pos_weight", 1, 20)
    model = XGBClassifier(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        gamma=gamma, 
        scale_pos_weight=scale_pos_weight, 
        random_state=0
    )
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = f1_score(y_test, preds)
    return score

#study = optuna.create_study(direction="maximize", sampler=sampler)
#study.optimize(objective, n_trials=500)

#xgb_params = study.best_params
xgb_params = {
    'max_depth': 4, 
    'n_estimators': 372, 
    'learning_rate': 0.09345905554110154, 
    'gamma': 0.6641238000625036, 
    'scale_pos_weight': 4
}
xgb_params['random_state'] = 0
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
preds = xgb.predict(X_test)
print('Optimized XGBClassifier accuracy: ', accuracy_score(y_test, preds))
print('Optimized XGBClassifier f1-score', f1_score(y_test, preds))
Optimized XGBClassifier accuracy:  0.7639001862979192
Optimized XGBClassifier f1-score 0.45928730244576643
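Most of the f1 gain comes from `scale_pos_weight`, which upweights the rare positive class during training. A common heuristic starting point is the negative-to-positive ratio:

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()  # ~7.2 for this split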
In [55]:
def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 7)
    n_estimators = trial.suggest_int("n_estimators", 2, 200)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    model = RandomForestClassifier(
        min_samples_leaf=min_samples_leaf, 
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        random_state=0
    )
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = f1_score(y_test, preds)
    return score

study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)
rf_params = study.best_params
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print('Optimized RF accuracy: ', accuracy_score(y_test, preds))
print('Optimized RF f1-score:', f1_score(y_test, preds))
[I 2020-11-28 23:04:21,018] A new study created in memory with name: no-name-8593fdf0-8c6a-46e7-a9fd-8f824491a66e
[I 2020-11-28 23:04:31,018] Trial 0 finished with value: 0.0 and parameters: {'max_depth': 6, 'n_estimators': 49, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.0.
(trials 1-98 omitted: every one of the 100 trials finished with value 0.0)
[I 2020-11-28 23:33:29,144] Trial 99 finished with value: 0.0 and parameters: {'max_depth': 3, 'n_estimators': 147, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.0.
Optimized RF accuracy:  0.876571068720317
Optimized RF f1-score: 0.0
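Every trial scored f1 = 0.0: with only ~12% positives, shallow trees minimize impurity by predicting the majority class everywhere, so accuracy settles at the negative rate (~87.7%) while f1 collapses. Unlike the XGBoost search, this one never re-weighted the positive class; adding `class_weight` to the search space would likely change the picture (a sketch, not run here):

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 7)
    n_estimators = trial.suggest_int("n_estimators", 2, 200)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    return RandomForestClassifier(
        min_samples_leaf=min_samples_leaf,
        n_estimators=n_estimators,
        max_depth=max_depth,
        class_weight='balanced',  # upweight the ~12% positive class
        random_state=0
    )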
In [58]:
def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Input(7),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(30, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(
        loss=tf.keras.losses.binary_crossentropy, 
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[keras_f1_score]
    )
    return model
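Two names used in the next cells, `y_nn_train` and `keras_f1_score`, were defined in cells missing from this export (presumably In [59] and an earlier one). Given the `to_categorical` and Keras-backend imports at the top, plausible reconstructions look like this (our guess, not the author's exact code):

def keras_f1_score(y_true, y_pred):
    # Batch-wise F1 built from precision and recall with backend ops.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

y_nn_train = to_categorical(y_train)  # one-hot targets for the 2-unit softmax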
In [60]:
class_weight = {
    0: 1.,
    1: 8.
}
In [61]:
model = create_model()
model.fit(X_train, y_nn_train, validation_split=0.2, epochs=35, batch_size=256, verbose=2, class_weight=class_weight)
Epoch 1/35
953/953 - 2s - loss: 0.9532 - keras_f1_score: 0.6504 - val_loss: 0.5524 - val_keras_f1_score: 0.6382
Epoch 2/35
953/953 - 2s - loss: 0.8204 - keras_f1_score: 0.6696 - val_loss: 0.5035 - val_keras_f1_score: 0.6540
(epochs 3-34 omitted: training loss plateaus around 0.796)
Epoch 35/35
953/953 - 2s - loss: 0.7943 - keras_f1_score: 0.6781 - val_loss: 0.5155 - val_keras_f1_score: 0.6761
Out[61]:
<tensorflow.python.keras.callbacks.History at 0x7f2b2e042d90>
In [62]:
In [63]:
print('NN accuracy: ', accuracy_score(y_test, preds))
print('NN f1-score', f1_score(y_test, preds))
NN accuracy:  0.677625882291202
NN f1-score 0.42104519108430327